# Practice: scrape the Douban Top 250 movie list
import re
import time

import requests
import pprint as pp

# Browser-like User-Agent sent with every request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/114.0'}

# Listing URL; the placeholder is the zero-based offset of the first movie
# shown on the requested page.
PAGE_URL_TEMPLATE = 'https://movie.douban.com/top250?start={}&filter='
PAGE_SIZE = 25          # movies per listing page
TOTAL_MOVIES = 250      # size of the whole list -> 10 pages (offsets 0..225)
REQUEST_TIMEOUT = 10    # seconds; without a timeout requests.get may block forever
PAGE_DELAY = 5          # seconds to wait between two page requests

# Compiled once at import time instead of once per page.  re.S lets ``.``
# span newlines because one movie entry covers many lines of HTML.
MOVIE_PATTERN = re.compile(
    r'<div class="item">.*?<em.*?>(?P<rank>.*?)</em>.*?<span class="title">(?P<movie_name>.*?)</span>'
    r'.*?导演: (?P<movie_director>.*?)&nbsp;'
    r'.*?<span class="rating_num" property="v:average">(?P<movie_score>.*?)</span>',
    re.S)


def build_page_urls():
    """Return the URLs of all listing pages, in ranking order.

    The original loop used ``range(0, 11)`` and therefore also requested
    offset 250, a page past the end of the 250-entry list.
    """
    return [PAGE_URL_TEMPLATE.format(offset)
            for offset in range(0, TOTAL_MOVIES, PAGE_SIZE)]


def parse_movies(content):
    """Extract the movie entries from the HTML of one listing page.

    Args:
        content: HTML text of a listing page.

    Returns:
        A list of dicts with the keys ``rank``, ``movie_name``,
        ``movie_director`` and ``movie_score`` (all values are the raw
        matched strings), in page order.  The list is empty when nothing
        in ``content`` matches.
    """
    return [match.groupdict() for match in MOVIE_PATTERN.finditer(content)]


def main():
    """Download every listing page and print one dict per movie.

    Raises:
        requests.HTTPError: if a page answers with an HTTP error status, so
            a blocked request is reported instead of silently printing
            nothing.
    """
    for page_number, url in enumerate(build_page_urls()):
        if page_number:
            # Pause between requests only; no pointless wait after the last page.
            time.sleep(PAGE_DELAY)

        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()

        for movie in parse_movies(response.text):
            print(movie)


if __name__ == '__main__':
    main()